setwd("/Users/vh3/Documents/Tryps/ANALYSIS_2")
require("Matrix")
library(scater, quietly = TRUE)
require("SingleCellExperiment")
options(stringsAsFactors = FALSE)
library(plotly)
library(ggplot2)
library(SC3)
molecules <- read.table("/Users/vh3/Documents/Tryps/Expression_matrices_927/TCAcounts927.csv", header = TRUE, sep = ",", row.names=1, stringsAsFactors = TRUE)
anno <- read.delim("/Users/vh3/Documents/Tryps/Expression_matrices_927/TCA_pheno.csv", header = TRUE, sep = ",", row.names = 1)
table(colnames(molecules)==anno$sample_id)
##
## TRUE
## 640
anno <- anno[match(colnames(molecules), anno$sample_id), ]
mca <- SingleCellExperiment(assays = list(
counts = as.matrix(molecules),
logcounts = log2(as.matrix(molecules) + 1)
), colData = anno)
CellQC <- perCellQCMetrics(mca)
FeatQC <- perFeatureQCMetrics(mca)
colData(mca) <- cbind(colData(mca), CellQC)
rowData(mca) <- cbind(rowData(mca), FeatQC)
data <- as.data.frame(colData(mca))
tapply(mca$total, data$run_number, sum)
## 28745_8 28959_8 29748_4
## 28900111 23809889 24605966
#Inspect poor quality cells
mca <- mca[, mca$num_cells == "SC"]
median(colData(mca)$detected)
## [1] 634
tab <- as.data.frame(colData(mca))
ggplot(tab, aes(x=detected, fill = run_number)) + geom_histogram(binwidth = 25) + geom_vline(xintercept = 40, col ="navy")

ggplot(tab, aes(x=detected, fill = run_number)) + geom_histogram(binwidth = 50) + geom_vline(xintercept = 40, col ="navy") + facet_grid(ShortenedLifeStage~., scales="free")

ggplot(tab, aes(x=detected, fill = attachment)) + geom_histogram(binwidth = 50) + geom_vline(xintercept = 40, col ="navy") + geom_vline(xintercept = 3000, col ="navy") + facet_grid(ShortenedLifeStage~., scales="free")

tab$ShortenedLifeStage <- factor(tab$ShortenedLifeStage, levels=c("PC", "MG", "PV", "SG"))
cols2 <- c("DSP"="aquamarine", "hypotherm"="dodgerblue4", "live"="mediumorchid1")
ggplot(tab, aes(x=detected, fill = fixation)) + geom_histogram(binwidth = 50) + geom_vline(xintercept = 40, col ="firebrick1") + geom_vline(xintercept = 3000, col ="firebrick1") + facet_grid(ShortenedLifeStage~., scales="free") + theme_bw() + scale_fill_manual(values=cols2)

ggplot(tab, aes(x=sum, fill = fixation)) + geom_histogram(bins = 100) + geom_vline(xintercept = 1000, col ="navy") + facet_grid(ShortenedLifeStage~., scales="free")

tabnoPV <- tab[tab$ShortenedLifeStage!="PV", ]
ggplot(tabnoPV, aes(x=detected, fill = fixation)) + geom_histogram(binwidth = 50) + geom_vline(xintercept = 40, col ="firebrick1") + geom_vline(xintercept = 3000, col ="firebrick1") + facet_grid(ShortenedLifeStage~., scales="free") + theme_bw() + scale_fill_manual(values=cols2)

mod <- lm(tabnoPV$detected~tabnoPV$fixation*tabnoPV$ShortenedLifeStage)
anova(mod)
## Analysis of Variance Table
##
## Response: tabnoPV$detected
## Df Sum Sq Mean Sq F value
## tabnoPV$fixation 2 6665280 3332640 6.0877
## tabnoPV$ShortenedLifeStage 2 45180707 22590354 41.2656
## tabnoPV$fixation:tabnoPV$ShortenedLifeStage 2 6888245 3444123 6.2913
## Residuals 570 312039546 547438
## Pr(>F)
## tabnoPV$fixation 0.002421 **
## tabnoPV$ShortenedLifeStage < 2.2e-16 ***
## tabnoPV$fixation:tabnoPV$ShortenedLifeStage 0.001983 **
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tabnoPV$ag <- paste(tabnoPV$ShortenedLifeStage, tabnoPV$fixation, sep="_")
tf <- tapply(tabnoPV$detected, tabnoPV$ag, median)
tf
## MG_DSP MG_live PC_DSP PC_live SG_DSP SG_hypotherm
## 715.0 387.0 752.5 1583.5 256.0 570.0
## SG_live
## 403.0
tabSG <- tab[tab$ShortenedLifeStage=="SG", ]
ggplot(tabSG, aes(x=detected, fill = fixation)) + geom_histogram(binwidth = 50) + geom_vline(xintercept = 40, col ="firebrick1") + geom_vline(xintercept = 3000, col ="firebrick1") + theme_bw() + scale_fill_manual(values=cols2)

mod <- lm(tabSG$detected~tabSG$fixation)
anova(mod)
## Analysis of Variance Table
##
## Response: tabSG$detected
## Df Sum Sq Mean Sq F value Pr(>F)
## tabSG$fixation 2 15413512 7706756 13.194 2.911e-06 ***
## Residuals 372 217291685 584117
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tf <- tapply(tabSG$detected, tabSG$fixation, median)
tf
## DSP hypotherm live
## 256 570 403
meds <- tapply(colData(mca)$detected, colData(mca)$ShortenedLifeStage, median)
meds
## MG PC PV SG
## 555.5 1234.0 1045.5 476.0
ggplot(tab, aes(x=sum, fill = time)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

sgonly <- mca[ , mca$ShortenedLifeStage == "SG"]
sgtab <- as.data.frame(colData(sgonly))
ggplot(sgtab, aes(x=detected, fill = attachment)) + geom_histogram(binwidth = 50) + geom_vline(xintercept = 250, col ="navy")

# Filter cells with low counts
filter_by_sum <- (mca$sum > 1000)
table(filter_by_sum)
## filter_by_sum
## FALSE TRUE
## 130 493
# Filter cells with low numbers of features detected
filter_by_expr_features <- (mca$detected > 40)
table(filter_by_expr_features)
## filter_by_expr_features
## FALSE TRUE
## 101 522
filter_by_expr_features_high <- (mca$detected < 3000)
table(filter_by_expr_features_high)
## filter_by_expr_features_high
## FALSE TRUE
## 10 613
# filter out control samples
#filter_by_control <- mca$num_cells == "SC"
#table(filter_by_control)
# Filter data
mca$use <- (
# sufficient features (genes)
filter_by_expr_features &
filter_by_expr_features_high &
# sufficient molecules counted
filter_by_sum
# controls shouldn't be used in downstream analysis
#filter_by_control
)
table(mca$use)
##
## FALSE TRUE
## 141 482
table(mca$use, mca$ShortenedLifeStage)
##
## MG PC PV SG
## FALSE 16 14 12 99
## TRUE 78 94 34 276
tab <- table(mca$use, mca$plate_name)
#write.csv(tab, file="QC_250genes1000counts_20191029.csv")
mca.qc.cells <- mca[ , colData(mca)$use]
#Identifying outliers by PCA
mca.qc.cells <- runPCA(mca.qc.cells)
plotPCA(mca.qc.cells,
size_by = "detected",
colour_by = "plate_name")

#Looks at the distribution of reads for expression levels of genes
#scater::plotQC(mca, type = "highest-expression")
# Gene filtering
filter_genes <- apply(counts(mca[ , colData(mca)$use]), 1, function(x) length(x[x >= 1]) >= 2)
table(filter_genes)
## filter_genes
## FALSE TRUE
## 1849 9225
rowData(mca)$use <- filter_genes
dim(mca[rowData(mca)$use, colData(mca)$use])
## [1] 9225 482
assay(mca, "logcounts_raw") <- log2(counts(mca) + 1)
reducedDim(mca) <- NULL
mca.qc <- mca[rowData(mca)$use, colData(mca)$use]
mean(mca.qc$detected)
## [1] 977.0249
median(mca.qc$detected)
## [1] 830.5
#saveRDS(mca.qc, file="tca.qc_20200519.rds")
data <- as.data.frame(colData(mca.qc))
tapply(data$sum, data$run_number, sum)
## 28745_8 28959_8 29748_4
## 28576820 22450199 23161352
tapply(data$detected, data$ShortenedLifeStage, median)
## MG PC PV SG
## 692 1297 1315 676
assay(mca.qc, "counts_raw") <- counts(mca.qc)
logcounts(mca.qc) <- logNormCounts(mca.qc)
cpm(mca.qc) <- calculateCPM(mca.qc)
assay(mca.qc, "log_cpm") <- log2(calculateCPM(mca.qc) + 1)
mca.qc <- runPCA(mca.qc, exprs_values = "log_cpm")
plotPCA(mca.qc, colour_by="plate_name", size_by="detected")

mca.qc <- runUMAP(mca.qc, exprs_values = "log_cpm")
plotUMAP(mca.qc, colour_by="ShortenedLifeStage", size_by="detected")

plotPCA(mca.qc, colour_by="ShortenedLifeStage", size_by="detected")

plotUMAP(mca.qc, colour_by="strain", size_by="detected")

set.seed(222)
mca.qc <- runTSNE(mca.qc, exprs_values = "log_cpm")
plotTSNE(mca.qc, colour_by="ShortenedLifeStage", size_by="detected")

#saveRDS(mca.qc, file="tryps.qc.927_20200509.rds")